Load required packages

Load the dataset

# Location of the Excel workbook that holds the molecular descriptor table.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists -- confirm before sharing.
file_path <- "C:/Users/dongl/OneDrive/Desktop/Predict_melting_point/Predict_melting_point.xlsx - Descriptors - Copy.xlsx"

# Read the workbook into `descriptor_data` (no sheet or range is specified,
# so readxl's defaults apply).
descriptor_data <- readxl::read_excel(path = file_path)

Data cleaning: rename the column `Ave °C` to `target_var`, remove rows with missing values, remove duplicate rows, drop columns in which more than 95% of the values are zero, and store the cleaned data in `descriptor_data_cleaned`.

## Number of rows removed due to missing values: 5
## Number of duplicate rows found: 0
## tibble [2,701 × 151] (S3: tbl_df/tbl/data.frame)
##  $ target_var              : num [1:2701] -161.5 -117 -23 -80.8 -73 ...
##  $ SMILES                  : chr [1:2701] "C1(CCC1)C" "CN(C)C" "ClC(Cl)(Cl)Cl" "C#C" ...
##  $ MaxAbsEStateIndex       : num [1:2701] 2.31 2 4.83 4 5.31 2.21 3.83 3.8 5.47 5.4 ...
##  $ MaxEStateIndex          : num [1:2701] 2.31 2 4.83 4 5.31 2.21 3.83 3.8 5.47 5.4 ...
##  $ MinAbsEStateIndex       : num [1:2701] 1.06 2 1.61 4 0.2 1.27 0.31 0.78 0.86 0.89 ...
##  $ MinEStateIndex          : num [1:2701] 1.06 2 -1.61 4 -1.72 1.27 0.31 0.78 0.86 0.89 ...
##  $ qed                     : num [1:2701] 0.41 0.38 0.47 0.33 0.56 0.48 0.41 0.47 0.49 0.43 ...
##  $ SPS                     : num [1:2701] 25.8 9 12 1 11.3 ...
##  $ MolWt                   : num [1:2701] 70.1 59.1 153.8 26 248.8 ...
##  $ HeavyAtomMolWt          : num [1:2701] 60.1 50 153.8 24 248.8 ...
##  $ ExactMolWt              : num [1:2701] 70.1 59.1 151.9 26 245.8 ...
##  $ NumValenceElectrons     : num [1:2701] 30 26 32 10 54 48 42 42 32 30 ...
##  $ MaxPartialCharge        : num [1:2701] -0.04 -0.01 0.27 -0.12 0.23 -0.03 -0.02 -0.03 0.09 0.03 ...
##  $ MinPartialCharge        : num [1:2701] -0.06 -0.31 -0.07 -0.12 -0.08 -0.08 -0.1 -0.1 -0.13 -0.13 ...
##  $ MaxAbsPartialCharge     : num [1:2701] 0.06 0.31 0.27 0.12 0.23 0.08 0.1 0.1 0.13 0.13 ...
##  $ MinAbsPartialCharge     : num [1:2701] 0.04 0.01 0.07 0.12 0.08 0.03 0.02 0.03 0.09 0.03 ...
##  $ FpDensityMorgan1        : num [1:2701] 1.4 1 0.8 1 0.89 1.13 1.29 1.57 1.67 1.6 ...
##  $ FpDensityMorgan2        : num [1:2701] 1.8 1 0.8 1 1.22 1.63 1.43 2 2.5 2.2 ...
##  $ FpDensityMorgan3        : num [1:2701] 1.8 1 0.8 1 1.22 1.75 1.43 2 2.67 2.2 ...
##  $ BCUT2D_MWHI             : num [1:2701] 14.2 15 35.6 12.6 35.6 ...
##  $ BCUT2D_MWLOW            : num [1:2701] 9.88 11.01 11.84 11.43 10.71 ...
##  $ BCUT2D_CHGHI            : num [1:2701] 2.09 1.58 2.11 0.45 2.26 1.87 2.08 1.9 1.77 2.18 ...
##  $ BCUT2D_CHGLO            : num [1:2701] -2.18 -1.9 -1.91 -0.7 -2.07 -1.98 -2.16 -1.99 -1.62 -1.7 ...
##  $ BCUT2D_LOGPHI           : num [1:2701] 2.24 1.47 2.29 0.58 2.4 2.07 2.21 2.05 2.13 2.29 ...
##  $ BCUT2D_LOGPLOW          : num [1:2701] -2.04 -1.99 -1.81 -0.58 -2.01 -1.77 -2.04 -1.84 -1.29 -1.67 ...
##  $ BCUT2D_MRHI             : num [1:2701] 4.62 4.09 6.83 4.47 6.75 5.05 4.99 4.88 7.14 6.18 ...
##  $ BCUT2D_MRLOW            : num [1:2701] 0.35 0.51 1.76 3.31 1.42 0.89 0.51 0.64 2.02 0.99 ...
##  $ AvgIpc                  : num [1:2701] 1.3 0.81 0.72 1 1.62 1.67 1.31 1.27 1.86 2.12 ...
##  $ BalabanJ                : num [1:2701] 2.08 2.32 3.02 3 4.4 3.62 3.79 3.13 3.05 2 ...
##  $ BertzCT                 : num [1:2701] 27.02 8 19.12 4.75 125.73 ...
##  $ Chi0                    : num [1:2701] 3.7 3.58 4.5 2 7.65 6.57 6.08 5.86 4.41 3.7 ...
##  $ Chi0n                   : num [1:2701] 3.7 3.45 2.01 1.15 3.77 6.41 5.71 5.49 3.02 3.08 ...
##  $ Chi0v                   : num [1:2701] 3.7 3.45 5.04 1.15 8.3 6.41 5.71 5.49 4.59 3.83 ...
##  $ Chi1                    : num [1:2701] 2.39 1.73 2 1 3.85 3.68 2.94 3.13 2.89 2.43 ...
##  $ Chi1n                   : num [1:2701] 2.39 1.34 0.76 0.33 1.63 3.31 2.6 2.77 1.58 1.99 ...
##  $ Chi1v                   : num [1:2701] 2.39 1.34 2.27 0.33 3.9 3.31 2.6 2.77 2.84 2.53 ...
##  $ Chi2n                   : num [1:2701] 2.04 1.34 0.43 0 1.07 2.53 3.03 2.56 0.92 1.6 ...
##  $ Chi2v                   : num [1:2701] 2.04 1.34 3.86 0 4.68 2.53 3.03 2.56 2.5 1.91 ...
##  $ Chi3n                   : num [1:2701] 1.39 0 0 0 0.41 1.48 1.28 0.76 0.49 0.92 ...
##  $ Chi3v                   : num [1:2701] 1.39 0 0 0 2.32 1.48 1.28 0.76 1.53 1.35 ...
##  $ Chi4n                   : num [1:2701] 0.61 0 0 0 0.11 0.63 0 0.7 0.26 0.27 ...
##  $ Chi4v                   : num [1:2701] 0.61 0 0 0 0.96 0.63 0 0.7 0.93 0.58 ...
##  $ HallKierAlpha           : num [1:2701] 0 -0.04 1.16 -0.44 1.48 -0.26 -0.26 -0.26 -0.01 0.29 ...
##  $ Ipc                     : num [1:2701] 10.39 3.25 3.61 2 50.07 ...
##  $ Kappa1                  : num [1:2701] 3.2 3.96 6.16 1.56 10.48 ...
##  $ Kappa2                  : num [1:2701] 1 1.3 1.74 0.56 3.75 3.71 1.69 2.84 1.63 1.17 ...
##  $ Kappa3                  : num [1:2701] 0.44 1128.96 38.29 -4.71 4.02 ...
##  $ LabuteASA               : num [1:2701] 33.2 27.2 50 14.1 82.6 ...
##  $ PEOE_VSA1               : num [1:2701] 0 4.9 0 0 0 0 0 0 0 0 ...
##  $ PEOE_VSA10              : num [1:2701] 0 0 0 0 4.49 0 0 0 0 0 ...
##  $ PEOE_VSA11              : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PEOE_VSA12              : num [1:2701] 0 0 0 0 3.79 0 0 0 0 0 ...
##  $ PEOE_VSA13              : num [1:2701] 0 0 3.25 0 0 0 0 0 0 0 ...
##  $ PEOE_VSA14              : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PEOE_VSA2               : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PEOE_VSA3               : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PEOE_VSA4               : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PEOE_VSA5               : num [1:2701] 0 0 0 12.8 0 ...
##  $ PEOE_VSA6               : num [1:2701] 26.2 0 46.4 0 69.6 ...
##  $ PEOE_VSA7               : num [1:2701] 5.92 21.14 0 0 0 ...
##  $ PEOE_VSA8               : num [1:2701] 0 0 0 0 0 0 0 0 0 5.88 ...
##  $ PEOE_VSA9               : num [1:2701] 0 0 0 0 5.03 0 0 0 4.34 0 ...
##  $ SMR_VSA1                : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SMR_VSA10               : num [1:2701] 0 0 46.4 0 69.6 ...
##  $ SMR_VSA3                : num [1:2701] 0 4.9 0 0 0 0 0 0 0 0 ...
##  $ SMR_VSA4                : num [1:2701] 5.92 0 0 0 0 0 5.41 5.92 0 5.92 ...
##  $ SMR_VSA5                : num [1:2701] 26.19 0 3.25 0 3.79 ...
##  $ SMR_VSA6                : num [1:2701] 0 21.1 0 0 0 ...
##  $ SMR_VSA7                : num [1:2701] 0 0 0 0 9.52 ...
##  $ SMR_VSA9                : num [1:2701] 0 0 0 12.8 0 ...
##  $ SlogP_VSA1              : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SlogP_VSA10             : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SlogP_VSA11             : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SlogP_VSA12             : num [1:2701] 0 0 46.4 0 69.6 ...
##  $ SlogP_VSA2              : num [1:2701] 0 26.04 3.25 0 3.79 ...
##  $ SlogP_VSA3              : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SlogP_VSA4              : num [1:2701] 5.92 0 0 12.85 0 ...
##  $ SlogP_VSA5              : num [1:2701] 26.2 0 0 0 0 ...
##  $ SlogP_VSA6              : num [1:2701] 0 0 0 0 9.52 ...
##  $ SlogP_VSA7              : num [1:2701] 0 0 0 0 0 0 0 0 4.34 0 ...
##  $ SlogP_VSA8              : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ TPSA                    : num [1:2701] 0 3.24 0 0 0 0 0 0 0 0 ...
##  $ EState_VSA1             : num [1:2701] 0 0 3.25 0 3.79 0 0 0 0 0 ...
##  $ EState_VSA10            : num [1:2701] 0 0 0 0 0 0 0 0 0 0 ...
##  $ EState_VSA2             : num [1:2701] 0 0 0 0 9.52 0 0 0 0 0 ...
##  $ EState_VSA3             : num [1:2701] 0 0 0 0 0 0 5.41 0 0 0 ...
##  $ EState_VSA4             : num [1:2701] 5.92 0 0 0 0 0 0 5.92 4.34 11.8 ...
##  $ EState_VSA5             : num [1:2701] 19.3 0 0 0 0 ...
##  $ EState_VSA6             : num [1:2701] 0 0 0 0 0 ...
##  $ EState_VSA7             : num [1:2701] 0 26 0 0 0 ...
##  $ EState_VSA8             : num [1:2701] 6.92 0 0 12.85 0 ...
##  $ EState_VSA9             : num [1:2701] 0 0 46.4 0 69.6 ...
##  $ VSA_EState1             : num [1:2701] 0 0 -1.61 0 -1.96 0 0 0 0.86 0 ...
##  $ VSA_EState10            : num [1:2701] 0 0 19.3 0 31.4 ...
##  $ VSA_EState2             : num [1:2701] 0 2 0 0 0 0 0 0 0 0 ...
##  $ VSA_EState3             : num [1:2701] 0 0 0 0 -0.2 0 0 0 1.95 0 ...
##  $ VSA_EState4             : num [1:2701] 0 0 0 0 0 3.03 1.55 1.29 0 0 ...
##  $ VSA_EState5             : num [1:2701] 1.06 0 0 0 0 0 0 0.78 0 1.79 ...
##  $ VSA_EState6             : num [1:2701] 0 0 0 0 0 0 0 0 3.79 0 ...
##   [list output truncated]
##  - attr(*, "na.action")= 'omit' Named int [1:5] 277 337 686 1134 2608
##   ..- attr(*, "names")= chr [1:5] "277" "337" "686" "1134" ...

Summary column statistics

# Calculate summary statistics for every numeric descriptor column.
# The response column 'target_var' is excluded, and non-numeric columns
# (e.g. SMILES strings) are skipped because none of the statistics below are
# defined for them.
is_numeric_column <- vapply(descriptor_data_cleaned, is.numeric, logical(1))
columns_to_analyze <- setdiff(
  names(descriptor_data_cleaned)[is_numeric_column],
  "target_var"
)

# Names of the statistics, in the order they appear in `column_stats`.
stat_names <- c(
  "Count", "Distinct", "Min", "Max", "Average", "Std_Dev", "Range",
  "Zero_Values"
)

# Summarise one numeric vector as a named numeric vector of `stat_names`.
# Count is the full length (missing values included); Distinct counts NA as
# its own value; every other statistic ignores missing values.
summarise_numeric_column <- function(x) {
  value_range <- range(x, na.rm = TRUE)
  c(
    Count = length(x),
    Distinct = length(unique(x)),
    Min = value_range[1],
    Max = value_range[2],
    Average = mean(x, na.rm = TRUE),
    Std_Dev = sd(x, na.rm = TRUE),
    Range = value_range[2] - value_range[1],
    # na.rm keeps a single missing value from turning the count into NA
    Zero_Values = sum(x == 0, na.rm = TRUE)
  )
}

# One column of statistics per descriptor; vapply enforces that every
# summary has exactly length(stat_names) numeric entries.
stats_matrix <- vapply(
  columns_to_analyze,
  function(col) summarise_numeric_column(descriptor_data_cleaned[[col]]),
  FUN.VALUE = numeric(length(stat_names))
)

# Transpose so that each descriptor becomes one row of the result.
column_stats <- data.frame(
  Column = columns_to_analyze,
  t(stats_matrix),
  row.names = NULL,
  stringsAsFactors = FALSE
)

print(column_stats)
##                       Column Count Distinct     Min           Max       Average
## 1                 target_var  2701     1902 -187.75  4.376500e+02  6.014403e+01
## 2                     SMILES     0        0    0.00  0.000000e+00  0.000000e+00
## 3          MaxAbsEStateIndex  2701      794    0.00  1.382000e+01  8.163965e+00
## 4             MaxEStateIndex  2701      794    0.00  1.382000e+01  8.163961e+00
## 5          MinAbsEStateIndex  2701      192    0.00  7.810000e+00  5.138282e-01
## 6             MinEStateIndex  2701      469   -8.73  4.000000e+00 -1.501814e-01
## 7                        qed  2701       85    0.03  9.100000e-01  5.504258e-01
## 8                        SPS  2701      537    0.00  5.246000e+01  1.089180e+01
## 9                      MolWt  2701     1385   16.04  9.591700e+02  1.760413e+02
## 10            HeavyAtomMolWt  2701      915   12.01  9.591700e+02  1.654050e+02
## 11                ExactMolWt  2701     1411   16.03  9.491800e+02  1.756890e+02
## 12       NumValenceElectrons  2701       93    8.00  3.380000e+02  6.203036e+01
## 13          MaxPartialCharge  2701       69   -0.12  9.500000e-01  1.735024e-01
## 14          MinPartialCharge  2701       52   -0.63 -1.000000e-02 -3.288856e-01
## 15       MaxAbsPartialCharge  2701       62    0.03  9.500000e-01  3.354572e-01
## 16       MinAbsPartialCharge  2701       49    0.00  4.900000e-01  1.737579e-01
## 17          FpDensityMorgan1  2701      146    0.11  2.000000e+00  1.138371e+00
## 18          FpDensityMorgan2  2701      186    0.18  2.670000e+00  1.655046e+00
## 19          FpDensityMorgan3  2701      221    0.25  3.220000e+00  2.034546e+00
## 20               BCUT2D_MWHI  2701      272   12.01  1.269300e+02  2.714047e+01
## 21              BCUT2D_MWLOW  2701      184    9.49  1.201000e+01  1.025002e+01
## 22              BCUT2D_CHGHI  2701      147   -0.08  2.950000e+00  2.003436e+00
## 23              BCUT2D_CHGLO  2701      129   -2.56 -8.000000e-02 -1.978571e+00
## 24             BCUT2D_LOGPHI  2701      145    0.14  2.630000e+00  2.107593e+00
## 25            BCUT2D_LOGPLOW  2701      156   -2.88  1.400000e-01 -1.945539e+00
## 26               BCUT2D_MRHI  2701      361    2.50  1.428000e+01  6.256309e+00
## 27              BCUT2D_MRLOW  2701      258   -1.11  4.670000e+00  4.202555e-01
## 28                    AvgIpc  2701      173    0.00  3.580000e+00  1.986246e+00
## 29                  BalabanJ  2701      270    0.00  6.610000e+00  2.827016e+00
## 30                   BertzCT  2701     1693    0.00  1.935570e+03  2.657365e+02
## 31                      Chi0  2701      423    0.00  4.154000e+01  8.606549e+00
## 32                     Chi0n  2701      887    0.00  3.864000e+01  6.661999e+00
## 33                     Chi0v  2701      932    0.00  3.864000e+01  7.191896e+00
## 34                      Chi1  2701      525    0.00  2.803000e+01  5.498308e+00
## 35                     Chi1n  2701      688    0.00  2.554000e+01  3.717397e+00
## 36                     Chi1v  2701      728    0.00  2.618000e+01  4.104180e+00
## 37                     Chi2n  2701      588    0.00  1.746000e+01  2.612525e+00
## 38                     Chi2v  2701      643    0.00  2.662000e+01  3.058541e+00
## 39                     Chi3n  2701      452    0.00  1.161000e+01  1.630563e+00
## 40                     Chi3v  2701      509    0.00  2.833000e+01  1.958597e+00
## 41                     Chi4n  2701      344    0.00  7.760000e+00  1.014558e+00
## 42                     Chi4v  2701      388    0.00  3.150000e+01  1.244017e+00
## 43             HallKierAlpha  2701      346   -5.53  3.040000e+00 -9.939689e-01
## 44                       Ipc  2701      904    0.00  1.270636e+13  5.169880e+09
## 45                    Kappa1  2701      922    0.00  5.541000e+01  8.614528e+00
## 46                    Kappa2  2701      714    0.00  4.709000e+01  4.077205e+00
## 47                    Kappa3  2701      619  -27.04  9.507960e+03  1.226420e+01
## 48                 LabuteASA  2701     1696    8.74  3.548100e+02  7.174526e+01
## 49                 PEOE_VSA1  2701      115    0.00  3.041000e+01  4.550944e+00
## 50                PEOE_VSA10  2701      147   -0.06  3.448000e+01  2.539941e+00
## 51                PEOE_VSA11  2701       77    0.00  4.654000e+01  1.451466e+00
## 52                PEOE_VSA12  2701       64    0.00  3.490000e+01  9.226435e-01
## 53                PEOE_VSA13  2701       44    0.00  2.024000e+01  8.589448e-01
## 54                PEOE_VSA14  2701       66    0.00  4.789000e+01  1.968256e+00
## 55                 PEOE_VSA2  2701       74    0.00  3.959000e+01  3.004198e+00
## 56                 PEOE_VSA3  2701       68    0.00  4.390000e+01  1.814061e+00
## 57                 PEOE_VSA4  2701       40    0.00  7.903000e+01  1.023265e+00
## 58                 PEOE_VSA5  2701       41    0.00  4.640000e+01  9.953536e-01
## 59                 PEOE_VSA6  2701      375    0.00  2.835200e+02  2.313013e+01
## 60                 PEOE_VSA7  2701      380    0.00  1.593000e+02  1.704308e+01
## 61                 PEOE_VSA8  2701      342    0.00  6.533000e+01  6.731229e+00
## 62                 PEOE_VSA9  2701      257    0.00  5.286000e+01  5.006709e+00
## 63                  SMR_VSA1  2701      182    0.00  7.903000e+01  6.868097e+00
## 64                 SMR_VSA10  2701      387    0.00  1.593000e+02  1.233369e+01
## 65                  SMR_VSA3  2701       59    0.00  2.582000e+01  1.378767e+00
## 66                  SMR_VSA4  2701       33    0.00  3.173000e+01  9.714291e-01
## 67                  SMR_VSA5  2701      280   -0.06  2.965500e+02  1.471167e+01
## 68                  SMR_VSA6  2701      152    0.00  5.732000e+01  4.180300e+00
## 69                  SMR_VSA7  2701      446    0.00  2.426500e+02  2.770715e+01
## 70                  SMR_VSA9  2701       56    0.00  4.556000e+01  2.626797e+00
## 71                SlogP_VSA1  2701       90    0.00  4.149000e+01  2.356649e+00
## 72               SlogP_VSA10  2701       48    0.00  7.903000e+01  2.285054e+00
## 73               SlogP_VSA11  2701        9    0.00  2.300000e+01  1.565046e+00
## 74               SlogP_VSA12  2701       80    0.00  1.593000e+02  6.197627e+00
## 75                SlogP_VSA2  2701      542   -0.06  9.962000e+01  1.032320e+01
## 76                SlogP_VSA3  2701      116    0.00  3.228000e+01  3.197146e+00
## 77                SlogP_VSA4  2701       80    0.00  5.817000e+01  3.714136e+00
## 78                SlogP_VSA5  2701      258    0.00  2.904500e+02  1.661442e+01
## 79                SlogP_VSA6  2701      245    0.00  2.426500e+02  2.191129e+01
## 80                SlogP_VSA7  2701       26    0.00  5.238000e+01  1.021625e+00
## 81                SlogP_VSA8  2701       64    0.00  6.463000e+01  1.855302e+00
## 82                      TPSA  2701      357    0.00  2.058400e+02  3.224628e+01
## 83               EState_VSA1  2701      254    0.00  6.930000e+01  3.877297e+00
## 84              EState_VSA10  2701       75    0.00  7.903000e+01  4.689278e+00
## 85               EState_VSA2  2701      255    0.00  6.135000e+01  4.412133e+00
## 86               EState_VSA3  2701      263    0.00  7.585000e+01  5.004228e+00
## 87               EState_VSA4  2701      329   -0.06  7.304000e+01  6.781866e+00
## 88               EState_VSA5  2701      266    0.00  2.696700e+02  1.160126e+01
## 89               EState_VSA6  2701      129    0.00  7.280000e+01  7.177453e+00
## 90               EState_VSA7  2701      138    0.00  9.878000e+01  9.438038e+00
## 91               EState_VSA8  2701      353    0.00  1.941200e+02  1.143315e+01
## 92               EState_VSA9  2701      143    0.00  1.160100e+02  6.625409e+00
## 93               VSA_EState1  2701      693   -2.17  2.206400e+02  4.352107e+00
## 94              VSA_EState10  2701      484   -0.82  6.086000e+01  2.017930e+00
## 95               VSA_EState2  2701      857   -1.90  5.838000e+01  6.977001e+00
## 96               VSA_EState3  2701      897   -3.66  4.021000e+01  5.022847e+00
## 97               VSA_EState4  2701      714   -6.44  2.046000e+01  1.405935e+00
## 98               VSA_EState5  2701      472  -52.81  1.046000e+01 -1.038467e-01
## 99               VSA_EState6  2701     1081   -1.37  8.417000e+01  5.867523e+00
## 100              VSA_EState7  2701      726  -16.43  6.238000e+01  1.765391e+00
## 101              VSA_EState8  2701      655   -4.84  2.316000e+01  1.798967e+00
## 102              VSA_EState9  2701      272  -15.55  8.260000e+00  1.360459e-01
## 103             FractionCSP3  2701       77    0.00  1.000000e+00  3.424361e-01
## 104           HeavyAtomCount  2701       42    1.00  5.700000e+01  1.153906e+01
## 105                NHOHCount  2701        9    0.00  8.000000e+00  7.571270e-01
## 106                  NOCount  2701       13    0.00  1.400000e+01  2.041096e+00
## 107  NumAliphaticCarbocycles  2701        5    0.00  4.000000e+00  6.442058e-02
## 108 NumAliphaticHeterocycles  2701        4    0.00  3.000000e+00  7.034432e-02
## 109        NumAliphaticRings  2701        5    0.00  4.000000e+00  1.347649e-01
## 110   NumAromaticCarbocycles  2701        8    0.00  8.000000e+00  8.130322e-01
## 111  NumAromaticHeterocycles  2701        4    0.00  3.000000e+00  1.469826e-01
## 112         NumAromaticRings  2701        8    0.00  8.000000e+00  9.600148e-01
## 113            NumHAcceptors  2701       10    0.00  9.000000e+00  1.713069e+00
## 114               NumHDonors  2701        6    0.00  6.000000e+00  6.297668e-01
## 115           NumHeteroatoms  2701       16    0.00  1.800000e+01  2.725287e+00
## 116        NumRotatableBonds  2701       34    0.00  4.700000e+01  2.210663e+00
## 117        NumSaturatedRings  2701        5    0.00  4.000000e+00  7.182525e-02
## 118                RingCount  2701        8    0.00  9.000000e+00  1.094780e+00
## 119                  MolLogP  2701      658   -2.67  1.741000e+01  2.239630e+00
## 120                    MolMR  2701     1809    6.73  2.427300e+02  4.682242e+01
## 121                fr_Al_COO  2701        4    0.00  3.000000e+00  6.627175e-02
## 122                 fr_Al_OH  2701        4    0.00  3.000000e+00  6.960385e-02
## 123                   fr_ArN  2701        3    0.00  2.000000e+00  8.404295e-02
## 124                fr_Ar_COO  2701        3    0.00  2.000000e+00  5.923732e-02
## 125                  fr_Ar_N  2701        5    0.00  4.000000e+00  1.754906e-01
## 126                 fr_Ar_OH  2701        5    0.00  4.000000e+00  1.299519e-01
## 127                   fr_COO  2701        4    0.00  3.000000e+00  1.255091e-01
## 128                  fr_COO2  2701        4    0.00  3.000000e+00  1.258793e-01
## 129                   fr_C_O  2701        5    0.00  5.000000e+00  4.683451e-01
## 130             fr_C_O_noCOO  2701        5    0.00  5.000000e+00  3.469086e-01
## 131                   fr_NH0  2701        7    0.00  6.000000e+00  3.680118e-01
## 132                   fr_NH1  2701        5    0.00  4.000000e+00  1.395779e-01
## 133                   fr_NH2  2701        4    0.00  4.000000e+00  1.395779e-01
## 134          fr_alkyl_halide  2701       10    0.00  1.800000e+01  1.921511e-01
## 135                 fr_amide  2701        6    0.00  5.000000e+00  1.255091e-01
## 136               fr_aniline  2701        5    0.00  4.000000e+00  1.514254e-01
## 137           fr_aryl_methyl  2701        5    0.00  4.000000e+00  1.832655e-01
## 138               fr_benzene  2701        8    0.00  8.000000e+00  8.126620e-01
## 139              fr_bicyclic  2701        7    0.00  6.000000e+00  1.766013e-01
## 140                 fr_ester  2701        4    0.00  3.000000e+00  8.922621e-02
## 141                 fr_ether  2701        7    0.00  6.000000e+00  2.450944e-01
## 142               fr_halogen  2701       12    0.00  1.800000e+01  5.601629e-01
## 143                fr_ketone  2701        3    0.00  2.000000e+00  8.996668e-02
## 144        fr_ketone_Topliss  2701        3    0.00  2.000000e+00  6.108849e-02
## 145               fr_methoxy  2701        5    0.00  4.000000e+00  1.129211e-01
## 146                 fr_nitro  2701        5    0.00  4.000000e+00  9.514994e-02
## 147            fr_nitro_arom  2701        4    0.00  3.000000e+00  8.552388e-02
## 148    fr_para_hydroxylation  2701        5    0.00  4.000000e+00  1.395779e-01
## 149                fr_phenol  2701        5    0.00  4.000000e+00  1.251388e-01
## 150   fr_phenol_noOrthoHbond  2701        5    0.00  4.000000e+00  1.184746e-01
## 151         fr_unbrch_alkane  2701       34    0.00  4.100000e+01  7.171418e-01
##          Std_Dev        Range Zero_Values
## 1   9.346898e+01 6.254000e+02           1
## 2   0.000000e+00 0.000000e+00           0
## 3   3.356041e+00 1.382000e+01           1
## 4   3.356047e+00 1.382000e+01           1
## 5   5.218991e-01 7.810000e+00          28
## 6   1.270644e+00 1.273000e+01           9
## 7   1.344203e-01 8.800000e-01           0
## 8   4.084920e+00 5.246000e+01           1
## 9   7.577216e+01 9.431300e+02           0
## 10  7.261552e+01 9.471600e+02           0
## 11  7.552626e+01 9.331500e+02           0
## 12  2.663677e+01 3.300000e+02           0
## 13  1.367367e-01 1.070000e+00          72
## 14  1.514091e-01 6.200000e-01           0
## 15  1.511506e-01 9.200000e-01           0
## 16  1.196618e-01 4.900000e-01          72
## 17  3.719377e-01 1.890000e+00           0
## 18  4.731753e-01 2.490000e+00           0
## 19  5.595734e-01 2.970000e+00           0
## 20  2.253253e+01 1.149200e+02           0
## 21  3.151043e-01 2.520000e+00           0
## 22  2.048932e-01 3.030000e+00           0
## 23  1.697027e-01 2.480000e+00           0
## 24  2.119771e-01 2.490000e+00           0
## 25  2.367239e-01 3.020000e+00           0
## 26  1.652226e+00 1.178000e+01           0
## 27  6.320404e-01 5.780000e+00           4
## 28  3.938985e-01 3.580000e+00           1
## 29  5.131153e-01 6.610000e+00           1
## 30  2.037001e+02 1.935570e+03           4
## 31  3.510255e+00 4.154000e+01           1
## 32  3.077225e+00 3.864000e+01           1
## 33  3.108888e+00 3.864000e+01           1
## 34  2.490393e+00 2.803000e+01           1
## 35  2.021705e+00 2.554000e+01           1
## 36  2.156157e+00 2.618000e+01           1
## 37  1.523367e+00 1.746000e+01          10
## 38  1.897245e+00 2.662000e+01          10
## 39  1.120048e+00 1.161000e+01          73
## 40  1.589385e+00 2.833000e+01          73
## 41  8.281185e-01 7.760000e+00         193
## 42  1.424883e+00 3.150000e+01         193
## 43  8.631283e-01 8.570000e+00         102
## 44  2.456623e+11 1.270636e+13           1
## 45  3.981716e+00 5.541000e+01           1
## 46  3.387920e+00 4.709000e+01           2
## 47  2.257735e+02 9.535000e+03           6
## 48  2.977018e+01 3.460700e+02           0
## 49  4.748006e+00 3.041000e+01        1105
## 50  4.384323e+00 3.454000e+01        1843
## 51  3.938004e+00 4.654000e+01        2260
## 52  2.918698e+00 3.490000e+01        2382
## 53  2.543814e+00 2.024000e+01        2381
## 54  3.904804e+00 4.789000e+01        1996
## 55  4.650954e+00 3.959000e+01        1657
## 56  3.994742e+00 4.390000e+01        2034
## 57  3.831925e+00 7.903000e+01        2412
## 58  3.751215e+00 4.640000e+01        2478
## 59  2.661100e+01 2.835200e+02         670
## 60  1.366446e+01 1.593000e+02         381
## 61  7.461736e+00 6.533000e+01        1040
## 62  6.751778e+00 5.286000e+01        1397
## 63  7.140983e+00 7.903000e+01         925
## 64  1.282037e+01 1.593000e+02         679
## 65  3.415393e+00 2.582000e+01        2219
## 66  2.769780e+00 3.173000e+01        2338
## 67  2.530551e+01 2.966100e+02        1054
## 68  7.372412e+00 5.732000e+01        1691
## 69  2.301097e+01 2.426500e+02         725
## 70  5.138712e+00 4.556000e+01        1969
## 71  4.160890e+00 4.149000e+01        1874
## 72  5.064532e+00 7.903000e+01        2012
## 73  3.706333e+00 2.300000e+01        2202
## 74  1.210005e+01 1.593000e+02        1900
## 75  9.912812e+00 9.968000e+01         613
## 76  4.984146e+00 3.228000e+01        1687
## 77  6.570836e+00 5.817000e+01        1822
## 78  2.480933e+01 2.904500e+02         759
## 79  2.053384e+01 2.426500e+02         755
## 80  3.725404e+00 5.238000e+01        2395
## 81  5.583768e+00 6.463000e+01        2344
## 82  2.661784e+01 2.058400e+02         574
## 83  7.107046e+00 6.930000e+01        1738
## 84  6.195649e+00 7.903000e+01        1249
## 85  6.151865e+00 6.135000e+01        1456
## 86  6.897207e+00 7.585000e+01        1357
## 87  8.798462e+00 7.310000e+01        1175
## 88  2.098993e+01 2.696700e+02        1186
## 89  1.052613e+01 7.280000e+01        1531
## 90  1.508731e+01 9.878000e+01        1560
## 91  1.778714e+01 1.941200e+02        1164
## 92  9.298870e+00 1.160100e+02        1143
## 93  1.165540e+01 2.228100e+02        1661
## 94  4.383032e+00 6.168000e+01        1930
## 95  8.001439e+00 6.028000e+01        1148
## 96  6.458722e+00 4.387000e+01        1120
## 97  2.453432e+00 2.690000e+01         950
## 98  2.111076e+00 6.327000e+01        1182
## 99  6.661747e+00 8.554000e+01         875
## 100 5.014952e+00 7.881000e+01        1501
## 101 2.748039e+00 2.800000e+01        1328
## 102 1.211063e+00 2.381000e+01        2281
## 103 3.665076e-01 1.000000e+00         803
## 104 5.065904e+00 5.600000e+01           0
## 105 1.014100e+00 8.000000e+00        1464
## 106 1.743606e+00 1.400000e+01         574
## 107 2.740579e-01 4.000000e+00        2540
## 108 2.779785e-01 3.000000e+00        2526
## 109 3.853549e-01 4.000000e+00        2371
## 110 8.179879e-01 8.000000e+00        1046
## 111 3.880855e-01 3.000000e+00        2335
## 112 8.835286e-01 8.000000e+00         863
## 113 1.391592e+00 9.000000e+00         529
## 114 7.859335e-01 6.000000e+00        1438
## 115 1.982158e+00 1.800000e+01         282
## 116 3.483566e+00 4.700000e+01         774
## 117 2.906353e-01 4.000000e+00        2523
## 118 9.482413e-01 9.000000e+00         692
## 119 1.680500e+00 2.008000e+01           5
## 120 2.086092e+01 2.360000e+02           0
## 121 2.796409e-01 3.000000e+00        2543
## 122 2.999472e-01 3.000000e+00        2545
## 123 3.030232e-01 2.000000e+00        2494
## 124 2.438290e-01 2.000000e+00        2546
## 125 5.639863e-01 4.000000e+00        2401
## 126 4.151693e-01 4.000000e+00        2420
## 127 3.613003e-01 3.000000e+00        2389
## 128 3.616838e-01 3.000000e+00        2388
## 129 6.701419e-01 5.000000e+00        1676
## 130 6.188241e-01 5.000000e+00        1950
## 131 7.196183e-01 6.000000e+00        1989
## 132 4.381692e-01 4.000000e+00        2406
## 133 3.955208e-01 4.000000e+00        2367
## 134 8.356132e-01 1.800000e+01        2474
## 135 4.707962e-01 5.000000e+00        2467
## 136 4.384599e-01 4.000000e+00        2366
## 137 5.186462e-01 4.000000e+00        2343
## 138 8.181295e-01 8.000000e+00        1047
## 139 5.513489e-01 6.000000e+00        2367
## 140 3.493361e-01 3.000000e+00        2513
## 141 6.005391e-01 6.000000e+00        2223
## 142 1.194040e+00 1.800000e+01        1921
## 143 3.283731e-01 2.000000e+00        2493
## 144 2.631155e-01 2.000000e+00        2552
## 145 4.288482e-01 4.000000e+00        2489
## 146 3.456323e-01 4.000000e+00        2483
## 147 3.238907e-01 3.000000e+00        2504
## 148 4.415373e-01 4.000000e+00        2411
## 149 4.090201e-01 4.000000e+00        2432
## 150 4.009656e-01 4.000000e+00        2448
## 151 3.039731e+00 4.100000e+01        2413

Visualize how the column statistics relate to the target variable, in order to compare trends and identify patterns

# Prepare data for plotting 'Distinct' values vs 'target_var'
column_stats_distinct <- column_stats[, c("Column", "Distinct")]
# NOTE(review): this attaches the target_var of the i-th *data row* to the
# i-th *descriptor's* statistic, so the two plotted series are not paired by
# any shared key -- confirm this is the intended comparison.
# seq_len() (rather than 1:n) keeps the index empty when there are no rows.
column_stats_distinct$target_var <-
  descriptor_data_cleaned$target_var[seq_len(nrow(column_stats_distinct))]

# Apply log transformation to Distinct to enhance visibility of small changes
column_stats_distinct$Distinct <- log1p(column_stats_distinct$Distinct)

# Min-max scale a numeric vector to [0, 1], ignoring missing values.
# A vector with no spread (all values equal) is mapped to 0 instead of the
# NaN that a 0 / 0 division would produce.
normalize <- function(x) {
  value_range <- range(x, na.rm = TRUE)
  spread <- value_range[2] - value_range[1]
  if (!is.finite(spread) || spread == 0) {
    return(x - value_range[1])
  }
  (x - value_range[1]) / spread
}

# Normalize the values separately so both series share the same y scale
column_stats_distinct$Distinct <- normalize(column_stats_distinct$Distinct)
column_stats_distinct$target_var <- normalize(column_stats_distinct$target_var)

# Reshape to long format: one row per (Column, metric) pair
melted_stats <- reshape2::melt(column_stats_distinct, id.vars = "Column")

# Build the ggplot. theme_minimal() is a complete theme, so it must be added
# BEFORE the axis-text tweak; added afterwards it would discard the rotation.
p <- ggplot(
  melted_stats,
  aes(x = Column, y = value, color = variable, group = variable)
) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distinct Values and 'target_var' Comparison (Log Transformed & Normalized)",
    x = "Descriptors",
    y = "Normalized Values",
    color = "Metric"
  )

# Convert ggplot to an interactive Plotly plot and display it
p_plotly <- ggplotly(p)
p_plotly

Visualize column statistics average vs ‘target_var’

# Prepare data for plotting average vs 'target_var'
column_stats_avg <- column_stats[, c("Column", "Average")]
# NOTE(review): this attaches the target_var of the i-th *data row* to the
# i-th *descriptor's* statistic, so the two plotted series are not paired by
# any shared key -- confirm this is the intended comparison.
# seq_len() (rather than 1:n) keeps the index empty when there are no rows.
column_stats_avg$target_var <-
  descriptor_data_cleaned$target_var[seq_len(nrow(column_stats_avg))]

# Apply a signed log transformation to Average to enhance visibility of small
# changes. Plain log1p() returns NaN for averages below -1 (several
# descriptors have negative means), which would silently drop those points;
# sign(x) * log1p(|x|) is defined for every real value and equals log1p(x)
# for non-negative x.
column_stats_avg$Average <-
  sign(column_stats_avg$Average) * log1p(abs(column_stats_avg$Average))

# Normalize the values separately so both series share the same y scale
column_stats_avg$Average <- normalize(column_stats_avg$Average)
column_stats_avg$target_var <- normalize(column_stats_avg$target_var)

# Reshape to long format: one row per (Column, metric) pair
melted_avg <- reshape2::melt(column_stats_avg, id.vars = "Column")

# Build the ggplot. theme_minimal() is a complete theme, so it must be added
# BEFORE the axis-text tweak; added afterwards it would discard the rotation.
p_avg <- ggplot(
  melted_avg,
  aes(x = Column, y = value, color = variable, group = variable)
) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Average and 'target_var' Comparison (Log Transformed & Normalized)",
    x = "Descriptors",
    y = "Normalized Values",
    color = "Metric"
  )

# Convert ggplot to an interactive Plotly plot and display it
p_avg_plotly <- ggplotly(p_avg)
p_avg_plotly

Visualize column statistics Standard Deviation vs ‘target_var’

# Prepare data for plotting standard deviation vs 'target_var'
column_stats_std <- column_stats[, c("Column", "Std_Dev")]
# NOTE(review): this attaches the target_var of the i-th *data row* to the
# i-th *descriptor's* statistic, so the two plotted series are not paired by
# any shared key -- confirm this is the intended comparison.
# seq_len() (rather than 1:n) keeps the index empty when there are no rows.
column_stats_std$target_var <-
  descriptor_data_cleaned$target_var[seq_len(nrow(column_stats_std))]

# Apply log transformation to Std_Dev to enhance visibility of small changes
# (standard deviations are non-negative, so log1p() is always defined here)
column_stats_std$Std_Dev <- log1p(column_stats_std$Std_Dev)

# Normalize the values separately so both series share the same y scale
column_stats_std$Std_Dev <- normalize(column_stats_std$Std_Dev)
column_stats_std$target_var <- normalize(column_stats_std$target_var)

# Reshape to long format: one row per (Column, metric) pair
melted_std <- reshape2::melt(column_stats_std, id.vars = "Column")

# Build the ggplot. theme_minimal() is a complete theme, so it must be added
# BEFORE the axis-text tweak; added afterwards it would discard the rotation.
p_std <- ggplot(
  melted_std,
  aes(x = Column, y = value, color = variable, group = variable)
) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Standard Deviation and 'target_var' Comparison (Log Transformed & Normalized)",
    x = "Descriptors",
    y = "Normalized Values",
    color = "Metric"
  )

# Convert ggplot to an interactive Plotly plot and display it
p_std_plotly <- ggplotly(p_std)
p_std_plotly